While I was looking for an interesting project to work on, I came across a Kaggle post called “Tweets about the Top Companies from 2015 and 2020”.
(link : https://www.kaggle.com/omermetinn/tweets-about-the-top-companies-from-2015-to-2020)
It includes the following three data sets.
# Load the three Kaggle data sets from the local archive directory:
#   twt_to_cmpy — tweet_id -> ticker_symbol mapping
#   stk         — daily stock values per company
#   twt         — raw tweets with engagement counts
# file.path() replaces the paste(..., sep = "") idiom and produces the
# same "<mymachine>/archive/<file>" paths.
twt_to_cmpy <- read.csv(file.path(mymachine, "archive", "Company_Tweet.csv"))
stk <- read.csv(file.path(mymachine, "archive", "CompanyValues.csv"))
twt <- read.csv(file.path(mymachine, "archive", "Tweet.csv"))
Using these, I intend to analyze the correlation between Tweets and company’s stock.
# easypackages::libraries() attaches every listed package in one call.
library(easypackages)
libraries(
"dplyr",     # data wrangling verbs (group_by, summarize, joins)
"lubridate", # date-time parsing (as_datetime, as_date)
"ggplot2",   # plotting
"tidyr",     # reshaping (gather, pivot_wider)
"ggpubr",    # stat_regline_equation for regression labels
"ggupset",   # scale_x_upset for company-combination plots
"gt",        # presentation tables
"scales"     # axis/label formatting helpers
)
# Inspect the tweet-to-company mapping: structure, distinct tweet ids,
# companies referenced, and NA/blank counts in both columns.
str(twt_to_cmpy)
paste(
  "number of distinct tweets =",
  n_distinct(twt_to_cmpy$tweet_id)
)
paste(
  "company being referred =",
  paste(unique(twt_to_cmpy$ticker_symbol), collapse = ", ")
)
# is.na(x) | x == "" is TRUE for both NA and empty entries. The original
# sum(is.na(x), nrow(df[df$col == "", ])) form would double-count NAs:
# an NA condition keeps the row when subsetting with `[`, so each NA was
# counted once by is.na() and again by nrow().
paste(
  "number of NA or blanks in tweet_id =",
  sum(is.na(twt_to_cmpy$tweet_id) | twt_to_cmpy$tweet_id == "")
)
paste(
  "number of NA or blanks in ticker_symbol =",
  sum(is.na(twt_to_cmpy$ticker_symbol) | twt_to_cmpy$ticker_symbol == "")
)
## 'data.frame': 4336445 obs. of 2 variables:
## $ tweet_id : num 5.51e+17 5.51e+17 5.51e+17 5.51e+17 5.51e+17 ...
## $ ticker_symbol: chr "AAPL" "AAPL" "AAPL" "AAPL" ...
## [1] "number of distinct tweets = 3716836"
## [1] "company being referred = AAPL, GOOG, GOOGL, AMZN, MSFT, TSLA"
## [1] "number of NA or blanks in tweet_id = 0"
## [1] "number of NA or blanks in ticker_symbol = 0"
This data set contains two columns.
And based on the setup, it seems to show which tweet posts are affecting which company.
There isn’t much cleaning to do on this data, but it seems like there are duplicated tweet_ids (the number of distinct tweets is less than the total row count).
Does this mean tweets can be related to more than one company at a time?
Let’s find out.
# Count how many distinct companies each tweet tags, then tally tweets
# per tag-count (com_ct = 1 means a single-company tweet).
rf_cmpy <- twt_to_cmpy %>%
  group_by(tweet_id) %>%
  summarize(com_ct = n_distinct(ticker_symbol)) %>%
  count(com_ct)
rf_cmpy %>%
  gt() %>%
  tab_header(
    title = "Tweet Counts by Tagged Companies Count"
  ) %>%
  fmt_number(
    columns = "n",
    sep_mark = ",",
    decimals = 0  # counts are integers; without this they rendered as "x.00"
  ) %>%
  cols_label(
    com_ct = "Tagged Companies Count",
    n = "Tweet Counts"
  )
| Tweet Counts by Tagged Companies Count | |
|---|---|
| Tagged Companies Count | Tweet Counts |
| 1 | 3,295,212.00 |
| 2 | 277,578.00 |
| 3 | 101,539.00 |
| 4 | 32,186.00 |
| 5 | 9,820.00 |
| 6 | 501.00 |
# Expected row count after de-duplication: every tweet contributes one
# row per distinct company it tags, so the total is sum(com_ct * n).
paste(
  "correct row number should be",
  with(rf_cmpy, sum(com_ct * n))
)
## [1] "correct row number should be 4335835"
According to the above result, tweets can be related to multiple companies at a time.
Also, the data set should have 4,335,835 rows, but the table actually has 4,336,445.
This means some rows are duplicated and that needs to be eliminated.
# Drop exact duplicate (tweet_id, ticker_symbol) rows — same operation
# unique.data.frame() performs — and confirm the resulting row count.
twt_to_cmpy <- twt_to_cmpy[!duplicated(twt_to_cmpy), ]
nrow(twt_to_cmpy)
## [1] 4335835
Now, let’s look into which company/combinations of companies are being referred the most.
# Two per-tweet views of the tagged companies:
#  - twt_to_cmpy_ut: companies joined into one "A&B" string (kept for
#    inspection; not used by the plot below)
#  - twt_to_cmpy_u : companies as a list-column, the format ggupset needs
twt_to_cmpy_ut <- twt_to_cmpy %>%
group_by(tweet_id) %>%
summarize(company = paste(ticker_symbol, collapse = "&"))
twt_to_cmpy_u <- twt_to_cmpy %>%
group_by(tweet_id) %>%
summarize(company = list(ticker_symbol))
# UpSet-style bar chart: tweet counts per company combination.
twt_to_cmpy_u %>%
ggplot(aes(x = company)) +
geom_bar() +
# Label each bar with its count, angled to avoid overlap.
geom_text(
stat = 'count',
aes(label = format(after_stat(count), big.mark = ",")),
size = 2.5,
angle = 75,
hjust = 0
) +
scale_x_upset() +
scale_y_continuous(
limits = c(0,1300000),
labels = function(x) {format(x, big.mark=",", scientific=FALSE)}
) +
labs(
title = "Tweet Count by Tagged Company Combinations",
x = "Company Combination",
y = "Tweet Count"
) +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
According to the above, AAPL is the most tagged company while GOOGL is the least.
Also, there is a significant drop in tweet counts when more than one company is tagged in a tweet.
I will create a column called “mult_tag” where 0 will mean the tweet only tags one company while 1 means it has tagged multiple.
# Flag multi-company tweets: mult_tag is 1 when the tweet_id maps to more
# than one (tweet, company) row, 0 otherwise. as.numeric(n() > 1) gives
# the same 1/0 doubles as the original scalar ifelse(). ungroup() is
# added so the tweet_id grouping does not silently leak into every later
# operation on twt_to_cmpy.
twt_to_cmpy <- twt_to_cmpy %>%
  group_by(tweet_id) %>%
  mutate(mult_tag = as.numeric(n() > 1)) %>%
  ungroup()
str(stk)

# Parse the date column once up front so min/max and the later date
# filters operate on Date values rather than strings.
stk$day_date <- as.Date(stk$day_date)

# Count entries that are NA or blank. The original
# sum(is.na(x), nrow(stk[stk$col == "", ])) pattern double-counts NAs
# (an NA condition keeps the row when subsetting with `[`) and repeats
# the same boilerplate for every column.
n_na_or_blank <- function(x) sum(is.na(x) | x == "")

paste(
  "Date Range = min:", min(stk$day_date),
  "max:", max(stk$day_date)
)
paste(
  "companies included in the data set =",
  paste(unique(stk$ticker_symbol), collapse = ", ")
)
paste("number of NA or blanks in ticker_symbol =", n_na_or_blank(stk$ticker_symbol))
paste("number of NA in day_date =", sum(is.na(stk$day_date)))
paste("number of NA or blanks in close_value =", n_na_or_blank(stk$close_value))
paste("number of NA or blanks in open_value =", n_na_or_blank(stk$open_value))
paste("number of NA or blanks in high_value =", n_na_or_blank(stk$high_value))
paste("number of NA or blanks in low_value =", n_na_or_blank(stk$low_value))
paste("number of NA or blanks in volume =", n_na_or_blank(stk$volume))
## 'data.frame': 17528 obs. of 7 variables:
## $ ticker_symbol: chr "AAPL" "AAPL" "AAPL" "AAPL" ...
## $ day_date : chr "2020-05-29" "2020-05-28" "2020-05-27" "2020-05-26" ...
## $ close_value : num 318 318 318 317 319 ...
## $ volume : int 38399530 33449100 28236270 31380450 20450750 25672210 27876220 25432390 33843130 41587090 ...
## $ open_value : num 319 317 316 324 316 ...
## $ high_value : num 321 323 319 324 319 ...
## $ low_value : num 316 316 313 316 315 ...
## [1] "Date Range = min: 2010-06-01 max: 2020-05-29"
## [1] "companies included in the data set = AAPL, AMZN, GOOGL, GOOG, MSFT, TSLA"
## [1] "number of NA or blanks in ticker_symbol = 0"
## [1] "number of NA in day_date = 0"
## [1] "number of NA or blanks in close_value = 0"
## [1] "number of NA or blanks in open_value = 0"
## [1] "number of NA or blanks in high_value = 0"
## [1] "number of NA or blanks in low_value = 0"
## [1] "number of NA or blanks in volume = 0"
Looking at the structure of the data set, it doesn’t seem to require much cleaning either.
But just to make sure, I will quickly check whether the data includes stock info for every stock on each day.
# Count, for each coverage level, how many days record that many distinct
# companies. Note: arrange() has no `descending` argument — the original
# arrange(d_com_ct, descending = FALSE) silently sorted by the constant
# FALSE as a second key; plain ascending order was the intent.
stk %>%
  group_by(day_date) %>%
  summarize(d_com_ct = n_distinct(ticker_symbol)) %>%
  arrange(d_com_ct) %>%
  count(d_com_ct) %>%
  rename(
    "Number of Days" = n,
    "Tagged Companies Count" = d_com_ct
  ) %>%
  gt() %>%
  tab_header(
    title = "Number of Days by Tagged Companies Count"
  ) %>%
  fmt_number(
    columns = "Number of Days",
    sep_mark = ",",
    decimals = 0
  )
| Number of Days by Tagged Companies Count | |
|---|---|
| Tagged Companies Count | Number of Days |
| 4 | 20 |
| 5 | 942 |
| 6 | 2,123 |
It seems like there are days where some companies’ stock data is not recorded.
Let’s take a deeper look into it.
# First and last trading day recorded per company — reveals which
# companies' data starts late (GOOG, TSLA).
stk %>%
  group_by(ticker_symbol) %>%
  summarize("Min. Date" = min(day_date), "Max. Date" = max(day_date)) %>%
  gt() %>%
  tab_header(
    # fixed displayed-title typo: "Comapny" -> "Company"
    title = "First and Last Date Recorded for Each Company"
  ) %>%
  cols_label(
    ticker_symbol = "Company"
  )
| First and Last Date Recorded for Each Company | ||
|---|---|---|
| Company | Min. Date | Max. Date |
| AAPL | 2010-06-01 | 2020-05-29 |
| AMZN | 2010-06-01 | 2020-05-29 |
| GOOG | 2014-03-27 | 2020-05-29 |
| GOOGL | 2010-06-01 | 2020-05-29 |
| MSFT | 2010-06-01 | 2020-05-29 |
| TSLA | 2010-06-29 | 2020-05-29 |
# For each coverage level (number of companies recorded on a day), show
# the first and last date it occurs — pinpoints when GOOG and TSLA data
# begins relative to the others.
stk %>%
group_by(day_date) %>%
summarize(daily_recorded_companies = n_distinct(ticker_symbol)) %>%
group_by(daily_recorded_companies) %>%
summarize("Min. Date" = min(day_date), "Max. Date" = max(day_date)) %>%
gt() %>%
tab_header(
title = "First and Last Date Recorded for Tagged Companies Count"
) %>%
cols_label(
daily_recorded_companies = "Tagged Companies Count"
)
| First and Last Date Recorded for Tagged Companies Count | ||
|---|---|---|
| Tagged Companies Count | Min. Date | Max. Date |
| 4 | 2010-06-01 | 2010-06-28 |
| 5 | 2010-06-29 | 2014-03-26 |
| 6 | 2014-03-27 | 2020-05-29 |
Data for GOOG and TSLA starts on different days.
But other than that, every company seems to have continuous data throughout.
Now, let’s look at how the stock price changes over time.
Close value will be utilized as the stock prices.
# Closing price over time per company, with a dashed linear trend line
# and the fitted equation/R-squared from ggpubr::stat_regline_equation().
stk %>%
  ggplot(aes(x = day_date, group = ticker_symbol, color = ticker_symbol)) +
  geom_line(aes(y = close_value)) +
  geom_smooth(aes(y = close_value), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
  stat_regline_equation(
    # after_stat() replaces the deprecated ..var.. notation, matching its
    # use elsewhere in this script.
    aes(y = close_value, label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    show.legend = FALSE,
    label.x.npc = 0.1,
    label.y.npc = 1,
    size = 3.5
  ) +
  # format() has no `suffix` argument (the original suffix = "$" was
  # silently ignored); the "$" is appended by paste().
  scale_y_continuous(labels = function(x) {paste(format(x, big.mark = ","), "$")}) +
  labs(
    title = "Stock Prices Over Time by Ticker Symbols",
    x = "Date",
    y = "Closing Price",
    color = "Legend"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 18))
Stock prices for every company have increased over time.
AMZN had the biggest increase while MSFT has the smallest.
GOOG and GOOGL seems to generally have the same value over time.
Now, let’s take a look at how the traded volume changes over time.
# Daily traded volume over time per company, on a comma-formatted axis.
stk %>%
ggplot(aes(x = day_date, group = ticker_symbol, color = ticker_symbol)) +
geom_line(aes(y = volume)) +
scale_y_continuous(labels = function(x) {format(x, big.mark = ",", scientific=FALSE)}) +
labs(
title = "Volume Traded Over Time by Ticker Symbols",
x = "Date",
y = "Traded Stock Volume",
color = "Legend"
) +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 18))
While MSFT and AAPL have comparatively lower close values, they are traded more than any of the other companies.
str(twt)

# post_date arrives as epoch seconds (see str output): convert it to a
# datetime, then keep a Date-only column for daily aggregation.
twt$post_datetime <- as_datetime(twt$post_date)
twt$post_date <- as_date(twt$post_datetime)

# Count entries that are NA or blank. The logical-or form avoids the
# NA-row double counting of nrow(twt[twt$col == "", ]) (an NA condition
# keeps the row when subsetting with `[`).
n_na_or_blank <- function(x) sum(is.na(x) | x == "")

paste(
  "Date Range = min:", min(twt$post_date),
  "max:", max(twt$post_date)
)
paste("number of distinct tweets =", n_distinct(twt$tweet_id))
paste("number of NA or blanks in tweet_id =", n_na_or_blank(twt$tweet_id))
paste("number of NA or blanks in writer =", n_na_or_blank(twt$writer))
paste("number of NA in post_date =", sum(is.na(twt$post_date)))
paste("number of NA or blanks in body =", n_na_or_blank(twt$body))
paste("number of NA or blanks in comment_num =", n_na_or_blank(twt$comment_num))
paste("number of NA or blanks in retweet_num =", n_na_or_blank(twt$retweet_num))
paste("number of NA or blanks in like_num =", n_na_or_blank(twt$like_num))
paste("number of NA in post_datetime =", sum(is.na(twt$post_datetime)))
## 'data.frame': 3717964 obs. of 7 variables:
## $ tweet_id : num 5.5e+17 5.5e+17 5.5e+17 5.5e+17 5.5e+17 ...
## $ writer : chr "VisualStockRSRC" "KeralaGuy77" "DozenStocks" "ShowDreamCar" ...
## $ post_date : int 1420070457 1420070496 1420070510 1420070807 1420071005 1420071005 1420071005 1420071016 1420071017 1420071017 ...
## $ body : chr "lx21 made $10,008 on $AAPL -Check it out! http://profit.ly/1MnD8s?aff=202 Learn #howtotrade http://bit.ly/1c1N"| __truncated__ "Insanity of today weirdo massive selling. $aapl bid up 45 cents after hours after non stop selling in trading hours" "S&P100 #Stocks Performance $HD $LOW $SBUX $TGT $DVN $IBM $AMZN $F $APA $GM $MS $HAL $DIS $MCD $BMY $XOM more@ "| __truncated__ "$GM $TSLA: Volkswagen Pushes 2014 Record Recall Tally Higher https://pic.twitter.com/WIIc1lW7hW @ProTradersNews"| __truncated__ ...
## $ comment_num: int 0 0 0 0 0 0 0 0 0 0 ...
## $ retweet_num: int 0 0 0 0 0 0 0 0 0 0 ...
## $ like_num : int 1 0 0 1 1 1 1 1 1 1 ...
## [1] "Date Range = min: 2015-01-01 max: 2019-12-31"
## [1] "number of distinct tweets = 3716836"
## [1] "number of NA or blanks in tweet_id = 0"
## [1] "number of NA or blanks in writer = 47273"
## [1] "number of NA in post_date = 0"
## [1] "number of NA or blanks in body = 0"
## [1] "number of NA or blanks in comment_num = 0"
## [1] "number of NA or blanks in retweet_num = 0"
## [1] "number of NA or blanks in like_num = 0"
## [1] "number of NA in post_datetime = 0"
First of all, the data starts from 2015-01-01 to 2019-12-31.
Therefore, I will use the CompanyValue in the same time frame only.
# Restrict stock data to the tweet window; both bounds inclusive, exactly
# as between() treats them.
stk <- stk %>%
  filter(day_date >= as.Date("2015-01-01"), day_date <= as.Date("2019-12-31"))
Secondly, there are more rows than the number of tweet_id. Which means, some tweet_id’s are being duplicated.
Let’s see if the table contain any duplicated rows.
# Remove exact duplicate rows (if any) — the same operation
# unique.data.frame() performs — then report the row count.
twt <- twt[!duplicated(twt), ]
nrow(twt)
## [1] 3717964
# Distinct tweet ids in the tweet table (unchanged by the de-dup above).
n_distinct(twt$tweet_id)
## [1] 3716836
# Distinct tweet ids in the company mapping, for cross-checking.
n_distinct(twt_to_cmpy$tweet_id)
## [1] 3716836
The data table does not have any duplicated rows.
Let’s find out what is causing the duplication.
# Collect every row whose tweet_id appears more than once, formatting the
# id as plain digits so it prints readably.
twt_dup <- twt %>%
group_by(tweet_id) %>%
mutate(tweet_id_count = n()) %>%
filter(tweet_id_count > 1) %>%
mutate(tweet_id = format(tweet_id, scientific = FALSE)) %>%
arrange(desc(tweet_id_count), desc(tweet_id), desc(writer))
# Show the duplicated tweet groups containing the overall maximum
# retweet/comment/like values. twt_dup$retweet_num (etc.) references the
# full column, so the max is over all duplicated rows, not per group.
twt_dup %>%
filter(
retweet_num == max(twt_dup$retweet_num) |
comment_num == max(twt_dup$comment_num) |
like_num == max(twt_dup$like_num)
) %>%
select(tweet_id) %>%
unique() %>%
left_join(twt_dup, by = "tweet_id") %>%   # pull back every row of each selected group
arrange(desc(tweet_id), desc(writer)) %>%
gt() %>%
tab_header(
title = "Tweets with Max. Retweet, Comment and Like Values for the Duplicated Data"
)
| Tweets with Max. Retweet, Comment and Like Values for the Duplicated Data | |||||||
|---|---|---|---|---|---|---|---|
| writer | post_date | body | comment_num | retweet_num | like_num | post_datetime | tweet_id_count |
| 889589448706580480 | |||||||
| TheSquareMile | 2017-07-24 | #Alphabet $5.01 Q2 EPS tops Wall St's $4.46 view but stock dips. Profit -27.7% after $2.7bn EU fine. Revenue also beats, +21%; $26.01B $GOOG | 1 | 0 | 0 | 2017-07-24 20:53:47 | 2 |
| Ken_CityIndex | 2017-07-24 | #Alphabet $5.01 Q2 EPS tops Wall St's $4.46 view but stock dips. Profit -27.7% after $2.7bn EU fine. Revenue also beats, +21%; $26.01B $GOOG | 0 | 0 | 0 | 2017-07-24 20:53:47 | 2 |
| 791731855695437824 | |||||||
| CNBCnow | 2016-10-27 | EARNINGS: http://Amazon.com Q3 EPS $0.52 vs. $0.78 Est.; Q3 Revs. $32.71B vs. $32.69B Est. • $AMZN down 9% http://data.cnbc.com/quotes/amzn | 1 | 47 | 13 | 2016-10-27 20:02:38 | 2 |
| CNBC | 2016-10-27 | EARNINGS: http://Amazon.com Q3 EPS $0.52 vs. $0.78 Est.; Q3 Revs. $32.71B vs. $32.69B Est. • $AMZN down 9% http://data.cnbc.com/quotes/amzn | 1 | 19 | 5 | 2016-10-27 20:02:38 | 2 |
| 592753769231835136 | |||||||
| WSJMoneyBeat | 2015-04-27 | What to, you know <cough! cough!> *watch* for in Apple's earnings report. http://on.wsj.com/1JKWcYu via @WSJMoneyBeat $AAPL | 0 | 4 | 2 | 2015-04-27 18:14:46 | 2 |
| thecryptobook | 2015-04-27 | What to, you know <cough! cough!> *watch* for in Apple's earnings report. http://on.wsj.com/1JKWcYu via @WSJMoneyBeat $AAPL | 1 | 0 | 0 | 2015-04-27 18:14:46 | 2 |
| 590501113356365824 | |||||||
| HotpageNews | 2015-04-21 | #Apple , #Google helping terrorists with encryption – Manhattan DA $AAPL $GOOG http://hotpagenews.com/#101344 | 0 | 0 | 1 | 2015-04-21 13:03:31 | 2 |
| Hotpage_News | 2015-04-21 | #Apple , #Google helping terrorists with encryption – Manhattan DA $AAPL $GOOG http://hotpagenews.com/#101344 | 1 | 0 | 1 | 2015-04-21 13:03:31 | 2 |
| 1060342097457373184 | |||||||
| UnRealBonthu | 2018-11-08 | How is Jim Acosta’s press pass revocation is risk to $TSLA? | 1 | 0 | 1 | 2018-11-08 01:23:50 | 2 |
| antrum_alfred | 2018-11-08 | Tesla's Energy Storage Business Set To Come Into Its Own $TSLA#RenewableEnergy #Worldwide #CNN #FoxNews #ACN #financialfreedom | 0 | 0 | 1 | 2018-11-08 01:23:50 | 2 |
| 1021487171189055488 | |||||||
| FDRInvestments | 2018-07-23 | Alphabet $GOOG $GOOGL Tops Q2 EPS by $2.16 | 0 | 0 | 0 | 2018-07-23 20:08:14 | 2 |
| ERContrarian | 2018-07-23 | I’ll be cheering when $TSLA goes to zero. Elon’s behavior is disgusting. | 1 | 0 | 6 | 2018-07-23 20:08:14 | 2 |
# For each duplicated tweet_id, count how many distinct body texts it
# has — 1 means the duplicates share identical content.
twt_dup %>%
group_by(tweet_id) %>%
summarize(disct_body = n_distinct(body)) %>%
ggplot(aes(x = disct_body)) +
geom_bar(fill = "lightgray") +
geom_text(
stat = "count",
aes(label = format(after_stat(count), big.mark = ",")),
position = position_stack(vjust = 0.5)
) +
labs(
title = "Counting of Distinct Body Content Count per Tweet ID",
x = "Distinct Count of Body Content per Tweet ID",
y = "Tweet Counts"
) +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# Same breakdown as above, but counting distinct writers per duplicated
# tweet_id — 2+ means different accounts share one id.
twt_dup %>%
group_by(tweet_id) %>%
summarize(disct_writer = n_distinct(writer)) %>%
ggplot(aes(x = disct_writer)) +
geom_bar(fill = "lightgray") +
geom_text(
stat = "count",
aes(label = format(after_stat(count), big.mark = ",")),
position = position_stack(vjust = 0.5)
) +
labs(
title = "Counting of Distinct Writer Count per Tweet ID",
x = "Distinct Count of Writer per Tweet ID",
y = "Tweet Counts"
) +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
It seems like some duplicates have differing body contents, differing writers, or both.
Testing some of these tweets (paste https://twitter.com/anyuser/status/tweet_id replacing tweet_id with one of the value), there wasn’t an indicator that dictates which body or writer is currently being used for that particular tweet_id.
Also, the comment_num, retweet_num, and like_num columns do not carry over between the duplicated tweet ids.
Meaning, rows with the same tweet_id don’t necessarily share the same values for those three columns.
I am going to find out if removing these data will be possible.
Because, even though these duplicates does not occur often, I would not like to double count the values when joining with the Company_Tweet data set.
I am going to analyze their proportion and will remove them if they take up less than 5% to the total.
# Quick numeric overview of the duplicated rows.
summary(twt_dup)
# Per day: duplicated tweet ids / rows vs. the day's totals, with the
# duplicate share as a percentage, sorted by the highest share first.
twt_dup %>%
group_by(post_date) %>%
summarize(
dup_n = n_distinct(tweet_id),
dup_nrow = n()
) %>%
left_join(
twt %>%
group_by(post_date) %>%
summarize(
total_n = n_distinct(tweet_id),
total_nrow = n()
)
, by='post_date'
) %>%
mutate(
p_of_tot = sprintf("%.1f%%", dup_n * 100 / total_n),
p_of_tot_row = sprintf("%.1f%%", dup_nrow * 100 / total_nrow)
) %>%
arrange(desc(dup_n / total_n)) %>%
select(post_date, dup_n, total_n, p_of_tot, dup_nrow, total_nrow, p_of_tot_row) %>%
head(20) %>%
gt() %>%
tab_header(
title = "Proportion of Duplicated Data to the Total - Top 20"
) %>%
tab_spanner(
label = "by Tweets",
columns = c(dup_n, total_n, p_of_tot)
) %>%
tab_spanner(
label = "by Rows",
columns = c(dup_nrow, total_nrow, p_of_tot_row)
) %>%
cols_label(
dup_n = "Dup. Count",
total_n = "Total Count",
p_of_tot = "% of Total",
dup_nrow = "Dup. Count",
total_nrow = "Total Count",
p_of_tot_row = "% of Total",
post_date = "Post Date"
)
## tweet_id writer post_date body
## Length:2251 Length:2251 Min. :2015-01-28 Length:2251
## Class :character Class :character 1st Qu.:2016-04-23 Class :character
## Mode :character Mode :character Median :2016-07-18 Mode :character
## Mean :2016-07-18
## 3rd Qu.:2016-09-21
## Max. :2019-05-10
## comment_num retweet_num like_num
## Min. :0.00000 Min. : 0.00000 Min. : 0.00000
## 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.: 0.00000
## Median :0.00000 Median : 0.00000 Median : 0.00000
## Mean :0.00311 Mean : 0.06797 Mean : 0.04131
## 3rd Qu.:0.00000 3rd Qu.: 0.00000 3rd Qu.: 0.00000
## Max. :1.00000 Max. :47.00000 Max. :13.00000
## post_datetime tweet_id_count
## Min. :2015-01-28 16:00:50 Min. :2.000
## 1st Qu.:2016-04-23 23:50:32 1st Qu.:2.000
## Median :2016-07-18 18:57:54 Median :2.000
## Mean :2016-07-18 14:50:52 Mean :2.007
## 3rd Qu.:2016-09-21 18:54:45 3rd Qu.:2.000
## Max. :2019-05-10 18:01:58 Max. :3.000
| Proportion of Duplicated Data to the Total - Top 20 | ||||||
|---|---|---|---|---|---|---|
| Post Date | by Tweets | by Rows | ||||
| Dup. Count | Total Count | % of Total | Dup. Count | Total Count | % of Total | |
| 2016-06-18 | 9 | 1459 | 0.6% | 18 | 1468 | 1.2% |
| 2016-06-11 | 5 | 885 | 0.6% | 10 | 890 | 1.1% |
| 2016-09-25 | 5 | 896 | 0.6% | 10 | 901 | 1.1% |
| 2016-03-21 | 31 | 6224 | 0.5% | 62 | 6255 | 1.0% |
| 2016-07-24 | 6 | 1215 | 0.5% | 12 | 1221 | 1.0% |
| 2016-06-14 | 24 | 5224 | 0.5% | 48 | 5248 | 0.9% |
| 2016-06-19 | 5 | 1172 | 0.4% | 10 | 1177 | 0.8% |
| 2016-09-09 | 26 | 6097 | 0.4% | 53 | 6124 | 0.9% |
| 2016-07-03 | 3 | 725 | 0.4% | 6 | 728 | 0.8% |
| 2016-10-02 | 4 | 1023 | 0.4% | 8 | 1027 | 0.8% |
| 2016-09-18 | 4 | 1028 | 0.4% | 8 | 1032 | 0.8% |
| 2016-03-20 | 3 | 774 | 0.4% | 6 | 777 | 0.8% |
| 2016-09-11 | 4 | 1149 | 0.3% | 8 | 1153 | 0.7% |
| 2016-05-21 | 4 | 1157 | 0.3% | 8 | 1161 | 0.7% |
| 2016-03-05 | 4 | 1164 | 0.3% | 8 | 1168 | 0.7% |
| 2016-09-17 | 4 | 1246 | 0.3% | 8 | 1250 | 0.6% |
| 2016-04-17 | 2 | 634 | 0.3% | 4 | 636 | 0.6% |
| 2016-03-03 | 9 | 2910 | 0.3% | 18 | 2919 | 0.6% |
| 2016-05-25 | 10 | 3249 | 0.3% | 20 | 3259 | 0.6% |
| 2016-08-28 | 3 | 1006 | 0.3% | 6 | 1009 | 0.6% |
The result above show that highest % total of the duplicated by post date is less than 1.2% in terms of rows and 0.6% in terms of tweet_id.
Thus, if the duplicated data is deleted, it will only affect less than 1.2% of data each day.
# Per day: total comment/retweet/like counts carried by the duplicated
# rows, joined against the day's overall totals, plus the resulting
# duplicate-to-total rates for each metric.
twt_dup_sum <- twt_dup %>%
group_by(post_date) %>%
summarize(
dup_com_tot = sum(comment_num),
dup_ret_tot = sum(retweet_num),
dup_lik_tot = sum(like_num)
) %>%
left_join(
twt %>%
group_by(post_date) %>%
summarize(
com_tot = sum(comment_num),
ret_tot = sum(retweet_num),
lik_tot = sum(like_num)
)
, by="post_date"
) %>%
mutate(
com_rate = dup_com_tot/com_tot,
ret_rate = dup_ret_tot/ret_tot,
lik_rate = dup_lik_tot/lik_tot
)
# Present the 20 days where the duplicates carry the most engagement,
# formatted as percentages under per-metric column spanners.
twt_dup_sum %>%
arrange(desc(dup_com_tot + dup_ret_tot + dup_lik_tot)) %>%
mutate(
p_of_com_tot = sprintf('%.2f%%', com_rate * 100),
p_of_ret_tot = sprintf('%.2f%%', ret_rate * 100),
p_of_lik_tot = sprintf('%.2f%%', lik_rate * 100)
) %>%
select(
post_date,
dup_com_tot, com_tot, p_of_com_tot,
dup_ret_tot, ret_tot, p_of_ret_tot,
dup_lik_tot, lik_tot, p_of_lik_tot
) %>%
head(20) %>%
gt() %>%
tab_header(
title = "Proportion of Comment, Like, Retweet Count of the Duplicated Data to the Total - Top 20"
) %>%
tab_spanner(
label = "Comments",
columns = c(dup_com_tot, com_tot, p_of_com_tot)
) %>%
tab_spanner(
label = "Likes",
columns = c(dup_lik_tot, lik_tot, p_of_lik_tot)
) %>%
tab_spanner(
label = "Retweets",
columns = c(dup_ret_tot, ret_tot, p_of_ret_tot)
) %>%
cols_label(
post_date = "Post Date",
dup_com_tot = "Dup. Count", com_tot = "Total Count", p_of_com_tot = "% of Total",
dup_ret_tot = "Dup. Count", ret_tot = "Total Count", p_of_ret_tot = "% of Total",
dup_lik_tot = "Dup. Count", lik_tot = "Total Count", p_of_lik_tot = "% of Total"
)
| Proportion of Comment, Like, Retweet Count of the Duplicated Data to the Total - Top 20 | |||||||||
|---|---|---|---|---|---|---|---|---|---|
| Post Date | Comments | Retweets | Likes | ||||||
| Dup. Count | Total Count | % of Total | Dup. Count | Total Count | % of Total | Dup. Count | Total Count | % of Total | |
| 2016-10-27 | 2 | 617 | 0.32% | 66 | 2515 | 2.62% | 18 | 4058 | 0.44% |
| 2015-02-27 | 0 | 285 | 0.00% | 9 | 869 | 1.04% | 1 | 1026 | 0.10% |
| 2015-04-27 | 1 | 1636 | 0.06% | 4 | 5286 | 0.08% | 2 | 5366 | 0.04% |
| 2018-07-23 | 1 | 2385 | 0.04% | 0 | 3838 | 0.00% | 6 | 17247 | 0.03% |
| 2016-07-05 | 0 | 142 | 0.00% | 4 | 891 | 0.45% | 2 | 1062 | 0.19% |
| 2016-04-03 | 0 | 105 | 0.00% | 4 | 365 | 1.10% | 0 | 590 | 0.00% |
| 2016-07-07 | 0 | 153 | 0.00% | 2 | 638 | 0.31% | 2 | 1010 | 0.20% |
| 2015-01-28 | 0 | 990 | 0.00% | 3 | 5203 | 0.06% | 0 | 3646 | 0.00% |
| 2015-04-21 | 1 | 274 | 0.36% | 0 | 707 | 0.00% | 2 | 861 | 0.23% |
| 2016-06-03 | 0 | 103 | 0.00% | 2 | 564 | 0.35% | 1 | 737 | 0.14% |
| 2016-07-14 | 0 | 137 | 0.00% | 2 | 878 | 0.23% | 1 | 1004 | 0.10% |
| 2016-09-25 | 0 | 52 | 0.00% | 1 | 440 | 0.23% | 2 | 569 | 0.35% |
| 2016-09-29 | 0 | 281 | 0.00% | 2 | 639 | 0.31% | 1 | 1114 | 0.09% |
| 2016-10-05 | 0 | 242 | 0.00% | 2 | 1307 | 0.15% | 1 | 1594 | 0.06% |
| 2017-01-20 | 0 | 120 | 0.00% | 1 | 713 | 0.14% | 2 | 1024 | 0.20% |
| 2017-03-08 | 0 | 145 | 0.00% | 2 | 674 | 0.30% | 1 | 1384 | 0.07% |
| 2017-05-15 | 0 | 308 | 0.00% | 2 | 1531 | 0.13% | 1 | 3109 | 0.03% |
| 2018-11-08 | 1 | 1223 | 0.08% | 0 | 1606 | 0.00% | 2 | 9448 | 0.02% |
| 2016-02-27 | 0 | 52 | 0.00% | 2 | 244 | 0.82% | 0 | 350 | 0.00% |
| 2016-03-20 | 0 | 67 | 0.00% | 1 | 280 | 0.36% | 1 | 339 | 0.29% |
This shows the total values of the comment, retweet and like number for both duplicated and original data set, and the % of total represents the proportion of the duplicated values to the total.
# Maximum daily share of comment/retweet/like totals attributable to the
# duplicated tweets. funs() has been removed from dplyr and mutate_all()
# is superseded; across() with a lambda is the supported replacement.
twt_dup_sum %>%
  ungroup() %>%
  summarize(
    "max_com_%_of_tot" = max(com_rate),
    "max_ret_%_of_tot" = max(ret_rate),
    "max_lik_%_of_tot" = max(lik_rate)
  ) %>%
  mutate(across(everything(), ~ sprintf("%.2f%%", .x * 100)))
## # A tibble: 1 × 3
## `max_com_%_of_tot` `max_ret_%_of_tot` `max_lik_%_of_tot`
## <chr> <chr> <chr>
## 1 0.36% 2.62% 0.44%
This table shows the maximum value of the % total for the three values.
It means that, when the duplicated values are deleted, it will affect less than the percentages above for the corresponding columns on the days that contains the duplicates.
Since it seems like the impact would be less than 5% for each day, I believe it would be best to work without the duplicated rows.
# Drop every tweet whose id is in the duplicate set. anti_join() keeps
# exactly the rows of twt with no match, replacing the hand-rolled
# left_join + marker column + filter(is.na(mark) == TRUE) + select(-mark)
# pattern. twt_dup$tweet_id was format()-ed to character earlier, so it
# is converted back to numeric to match twt$tweet_id for the join.
twt <- twt %>%
  anti_join(
    twt_dup %>%
      ungroup() %>%
      distinct(tweet_id) %>%
      mutate(tweet_id = as.numeric(tweet_id)),
    by = "tweet_id"
  )
Finally, let’s see why some rows have blank writer column.
# Show a sample of rows whose writer is blank; filtering first and then
# reformatting tweet_id yields the same ten rows with readable ids.
twt %>%
  filter(writer == "") %>%
  mutate(tweet_id = format(tweet_id, scientific = FALSE)) %>%
  head(10)
## tweet_id writer post_date
## 1 550698244552925184 2015-01-01
## 2 551029459957325824 2015-01-02
## 3 551102918204874752 2015-01-02
## 4 551123120783122432 2015-01-02
## 5 551144074716397568 2015-01-02
## 6 551774355768610816 2015-01-04
## 7 551839407792463872 2015-01-04
## 8 551843305051676672 2015-01-04
## 9 552136823045357568 2015-01-05
## 10 552143317233053696 2015-01-05
## body
## 1 2015 technology forecasts: Wearable technology innovation > wearable sales $AAPL $T $VZ $MSFT $AMZN $NFLX http://dlvr.it/7zmKxy
## 2 Today in Twitter: an aggravated VC, word parsing, and $AAPL. 2015 off to a phenomenal start.
## 3 $AAPL LT view. This could be interesting. If it dips below 100, I'll be adding
## 4 $AAPL So, I didn't buy any today. It was hardly down at all really.
## 5 maybe $TSLA is selling off because Elon is getting divorced again?
## 6 5 IBD 50 Stocks Poised For Strong '15 Earnings Gains $AMBA $GPRO $BIDU $FB $AAPL $AVGO http://dlvr.it/80ytps
## 7 Nice webinar this morning @omillionaires , as usual. Thanks! http://optionmillionaires.com/category/general-option-tips/… #stocks $SSYS $TSLA $NFLX
## 8 @jonfortt Shhhhhhhhhhh, the bears don't need anymore excuses to push down $AAPL ;)
## 9 $AMZN reminder of our Mega Bear Call on Amazon - Inv HVF, down from 311 to 303 chart http://ow.ly/i/89yoL Vid https://youtube.com/watch?v=xO68ZVhbB5o…
## 10 Despite 10's of mil $, $GOOG "self driving" cars can't avoid unforeseen, moving obstacles. $GOSY #robots can: http://tinyurl.com/le8a39r
## comment_num retweet_num like_num post_datetime
## 1 0 0 2 2015-01-01 17:01:07
## 2 2 1 5 2015-01-02 14:57:15
## 3 1 0 0 2015-01-02 19:49:09
## 4 0 0 0 2015-01-02 21:09:26
## 5 0 0 0 2015-01-02 22:32:41
## 6 0 0 2 2015-01-04 16:17:12
## 7 0 0 0 2015-01-04 20:35:42
## 8 1 1 0 2015-01-04 20:51:11
## 9 0 0 0 2015-01-05 16:17:31
## 10 0 0 0 2015-01-05 16:43:19
I originally thought the absence of a writer was the result of a deleted tweet. But after testing a few, that seems to be false.
However, it might be a good idea to replace the blank value with a unique name to prevent it from being treated as a single writer.
# Give each blank writer a unique placeholder ("blank_1", "blank_2", ...)
# so anonymous tweets are not all lumped together as one "" writer.
# Direct vectorized assignment produces the same numbering (k-th blank in
# row order becomes "blank_k") as the original self-join on tweet_id,
# without the join machinery or its reliance on tweet_id uniqueness.
# which() drops NA comparisons, matching filter(writer == "") semantics.
blank_rows <- which(twt$writer == "")
twt$writer[blank_rows] <- paste0("blank_", seq_along(blank_rows))
Now that the table has been cleaned, I’d like to see how distributed “comment_num”,“retweet_num” and “like_num” values are.
# Distribution of comment counts per tweet, binned in steps of 20, with
# angled count labels above each bin. after_stat(count) replaces the
# deprecated ..count.. notation, matching its use elsewhere in this file.
ggplot(twt, aes(x = comment_num)) +
  geom_histogram(binwidth = 20, color = "gray", fill = "gray") +
  stat_bin(
    binwidth = 20,
    geom = "text",
    aes(label = format(after_stat(count), big.mark = ",")),
    angle = 75,
    size = 3,
    hjust = 0,
    vjust = 0
  ) +
  scale_y_continuous(limits = c(0, 4300000), labels = function(x) format(x, big.mark = ",", scientific = FALSE)) +
  labs(
    title = "Distribution of comment_num",
    y = "Tweet Count",
    x = "comment_num"
  ) +
  theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold"))
# Distribution of retweet counts per tweet (same layout as comment_num).
# after_stat(count) replaces the deprecated ..count.. notation.
ggplot(twt, aes(x = retweet_num)) +
  geom_histogram(binwidth = 20, color = "gray", fill = "gray") +
  stat_bin(
    binwidth = 20,
    geom = "text",
    aes(label = format(after_stat(count), big.mark = ",")),
    angle = 75,
    size = 3,
    hjust = 0,
    vjust = 0
  ) +
  scale_y_continuous(limits = c(0, 4300000), labels = function(x) format(x, big.mark = ",", scientific = FALSE)) +
  labs(
    title = "Distribution of retweet_num",
    x = "retweet_num",
    y = "Tweet Count") +
  theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold"))
# Distribution of like counts per tweet (same layout as comment_num).
# after_stat(count) replaces the deprecated ..count.. notation.
ggplot(twt, aes(x = like_num)) +
  geom_histogram(binwidth = 20, color = "gray", fill = "gray") +
  stat_bin(
    binwidth = 20,
    geom = "text",
    aes(label = format(after_stat(count), big.mark = ",")),
    angle = 75,
    size = 3,
    hjust = 0,
    vjust = 0
  ) +
  scale_y_continuous(limits = c(0, 4300000), labels = function(x) format(x, big.mark = ",", scientific = FALSE)) +
  labs(
    title = "Distribution of like_num",
    x = "like_num",
    y = "Tweet Count"
  ) +
  theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold"))
So, most of the tweets don’t have any comments, likes, or haven’t been retweeted.
Let’s see the ratio of 0 to non-zero values of these three columns.
# Binary engagement indicators: 1 when the tweet has at least one
# comment / retweet / like, 0 otherwise. NA counts were verified to be
# zero earlier, so the comparison is safe.
twt <- twt %>%
  mutate(
    yesno_comment = as.numeric(comment_num != 0),
    yesno_retweet = as.numeric(retweet_num != 0),
    yesno_like = as.numeric(like_num != 0)
  )
# Share of tweets with zero vs. non-zero engagement for each metric.
# pivot_longer() replaces the superseded gather(); the script already
# uses pivot_wider(), so this keeps the tidyr reshaping API consistent.
twt %>%
  select(yesno_comment, yesno_retweet, yesno_like) %>%
  rename(Comment = yesno_comment, Retweet = yesno_retweet, Like = yesno_like) %>%
  pivot_longer(everything(), names_to = "type", values_to = "yesno_value") %>%
  group_by(type, yesno_value) %>%
  summarize(p = sprintf("%.1f%%", n() * 100 / nrow(twt)), .groups = "drop") %>%
  pivot_wider(id_cols = "yesno_value", names_from = "type", values_from = p) %>%
  mutate(yesno_value = recode(yesno_value, "0" = "No Value", "1" = "Yes Value")) %>%
  gt(rowname_col = "yesno_value") %>%
  tab_header(
    title = "Proportion of Tweets with Comment, Likes, or Retweets"
  )
| Proportion of Tweets with Comment, Likes, or Retweets | |||
|---|---|---|---|
| Comment | Like | Retweet | |
| No Value | 85.6% | 66.8% | 83.3% |
| Yes Value | 14.4% | 33.2% | 16.7% |
From this result, we can see that 85.6% of the whole tweets do not have any comments, 66.8% don’t have likes and 83.3% haven’t been retweeted.
Now, I’d like to do some analysis on the writers.
Knowing that there are far fewer distinct writers than the 3,715,713 rows in the data table, we can reasonably assume some of them posted multiple tweets.
Let’s see the distribution of posted/retweeted tweet counts and the total like/comment counts among these writers.
# Per-writer aggregates: tweet count, total engagement, a multi-tweet
# flag (tweet_ct_m), plus each writer's share of the column totals.
twt_w <- twt %>%
group_by (writer) %>%
summarize (
tweet_ct = n_distinct(tweet_id),
comment_num = sum(comment_num),
retweet_num = sum(retweet_num),
like_num = sum(like_num),
tweet_ct_m = ifelse(n_distinct(tweet_id) > 1 , 1, 0),
.groups = "drop"
) %>%
mutate (
p_tweet_ct = tweet_ct / sum(tweet_ct),
p_comment_num = comment_num / sum(comment_num),
p_like_num = like_num / sum(like_num),
p_retweet_num = retweet_num / sum(retweet_num)
)
# Top writers by tweet volume, shown as a top-20 gt table. The first
# head() keeps the top 1% before formatting; the second trims to 20.
twt_w %>%
  arrange(desc(tweet_ct)) %>%
  head(nrow(twt_w) / 100) %>%
  mutate(
    # Per-element formatting is intentional: a vectorized format() would
    # pad every value to a common width. vapply() is type-stable, unlike
    # the original sapply().
    tweet_ct = vapply(tweet_ct, function(x) format(x, big.mark = ","), character(1)),
    percent_of_total = sprintf("%.1f%%", p_tweet_ct * 100)
  ) %>%
  head(20) %>%
  select(writer, tweet_ct, percent_of_total) %>%
  gt() %>%
  cols_label(
    writer = "Writer",
    tweet_ct = "Tweet Count",
    percent_of_total = "% of Total"
  ) %>%
  tab_header(
    title = "Writer with the Most Posted Tweets - Top 20"
  )
| Writer with the Most Posted Tweets - Top 20 | ||
|---|---|---|
| Writer | Tweet Count | % of Total |
| App_sw_ | 91,545 | 2.5% |
| _peripherals | 90,776 | 2.4% |
| computer_hware | 90,184 | 2.4% |
| It_c0nsulting | 74,970 | 2.0% |
| PortfolioBuzz | 59,837 | 1.6% |
| retail_Dbt | 44,052 | 1.2% |
| MacHashNews | 42,190 | 1.1% |
| ExactOptionPick | 22,084 | 0.6% |
| markbspiegel | 18,228 | 0.5% |
| davidmoadel | 15,553 | 0.4% |
| OACtrading | 14,432 | 0.4% |
| SeekingAlpha | 13,970 | 0.4% |
| treabase | 13,828 | 0.4% |
| rosnerstocks | 13,460 | 0.4% |
| PolgarStocks | 13,369 | 0.4% |
| Commuternyc | 13,241 | 0.4% |
| IHNewsDesk | 13,226 | 0.4% |
| JohnyTradr | 12,419 | 0.3% |
| TalkMarkets | 11,685 | 0.3% |
| Benzinga | 10,791 | 0.3% |
# Top-20 writers by total comments received, rendered as a gt table.
twt_w %>%
  arrange(desc(comment_num)) %>%
  head(nrow(twt_w) / 100) %>%  # keep the top 1% of writers first
  mutate(
    # vectorized format() with trim = TRUE replaces the element-wise sapply()
    comment_num = format(comment_num, big.mark = ",", trim = TRUE),
    percent_of_total = sprintf("%.1f%%", p_comment_num * 100)
  ) %>%
  head(20) %>%
  select(writer, comment_num, percent_of_total) %>%
  gt() %>%
  cols_label(
    writer = "Writer",
    comment_num = "Total Comments",
    percent_of_total = "% of Total"
  ) %>%
  tab_header(
    title = "Writer with the Most Total Comments - Top 20"
  )
| Writer with the Most Total Comments - Top 20 | ||
|---|---|---|
| Writer | Total Comments | % of Total |
| GerberKawasaki | 44,755 | 3.9% |
| markbspiegel | 34,211 | 2.9% |
| TESLAcharts | 32,638 | 2.8% |
| vincent13031925 | 18,362 | 1.6% |
| PlugInFUD | 17,066 | 1.5% |
| option_snipper | 11,937 | 1.0% |
| ValueAnalyst1 | 11,452 | 1.0% |
| QTRResearch | 11,228 | 1.0% |
| Sandro_power | 11,023 | 0.9% |
| orthereaboot | 8,927 | 0.8% |
| passthebeano | 8,557 | 0.7% |
| WallStCynic | 8,195 | 0.7% |
| jimcramer | 7,609 | 0.7% |
| Polixenes13 | 7,449 | 0.6% |
| 28delayslater | 7,293 | 0.6% |
| zomgapocalypse | 6,425 | 0.6% |
| RampCapitalLLC | 6,002 | 0.5% |
| Commuternyc | 5,540 | 0.5% |
| WPipperger | 5,523 | 0.5% |
| Gfilche | 5,322 | 0.5% |
# Top-20 writers by total likes received, rendered as a gt table.
twt_w %>%
  arrange(desc(like_num)) %>%
  head(nrow(twt_w) / 100) %>%  # keep the top 1% of writers first
  mutate(
    # vectorized format() with trim = TRUE replaces the element-wise sapply()
    like_num = format(like_num, big.mark = ",", trim = TRUE),
    percent_of_total = sprintf("%.1f%%", p_like_num * 100)
  ) %>%
  head(20) %>%
  select(writer, like_num, percent_of_total) %>%
  gt() %>%
  cols_label(
    writer = "Writer",
    like_num = "Total Likes",
    percent_of_total = "% of Total"
  ) %>%
  tab_header(
    title = "Writer with the Most Total Likes - Top 20"
  )
| Writer with the Most Total Likes - Top 20 | ||
|---|---|---|
| Writer | Total Likes | % of Total |
| vincent13031925 | 325,683 | 3.9% |
| GerberKawasaki | 313,577 | 3.8% |
| TESLAcharts | 268,907 | 3.3% |
| markbspiegel | 166,534 | 2.0% |
| PlugInFUD | 100,837 | 1.2% |
| orthereaboot | 95,515 | 1.2% |
| option_snipper | 94,449 | 1.1% |
| ValueAnalyst1 | 81,233 | 1.0% |
| Polixenes13 | 81,087 | 1.0% |
| QTRResearch | 73,779 | 0.9% |
| Teslarati | 68,376 | 0.8% |
| 28delayslater | 68,213 | 0.8% |
| passthebeano | 66,485 | 0.8% |
| WallStCynic | 64,898 | 0.8% |
| TeslaOpinion | 57,832 | 0.7% |
| ElonBachman | 56,126 | 0.7% |
| Stocktwits | 54,309 | 0.7% |
| evannex_com | 53,923 | 0.7% |
| BarkMSmeagol | 53,860 | 0.7% |
| Sandro_power | 52,338 | 0.6% |
# Top-20 writers by total retweets received, rendered as a gt table.
twt_w %>%
  arrange(desc(retweet_num)) %>%
  head(nrow(twt_w) / 100) %>%  # keep the top 1% of writers first
  mutate(
    # vectorized format() with trim = TRUE replaces the element-wise sapply()
    retweet_num = format(retweet_num, big.mark = ",", trim = TRUE),
    percent_of_total = sprintf("%.1f%%", p_retweet_num * 100)
  ) %>%
  head(20) %>%
  select(writer, retweet_num, percent_of_total) %>%
  gt() %>%
  cols_label(
    writer = "Writer",
    retweet_num = "Total Retweets",
    percent_of_total = "% of Total"
  ) %>%
  tab_header(
    title = "Writer with the Most Total Retweets - Top 20"
  )
| Writer with the Most Total Retweets - Top 20 | ||
|---|---|---|
| Writer | Total Retweets | % of Total |
| philstockworld | 74,661 | 3.2% |
| vincent13031925 | 57,553 | 2.5% |
| GerberKawasaki | 47,117 | 2.0% |
| Stocktwits | 42,158 | 1.8% |
| Sandro_power | 41,545 | 1.8% |
| TESLAcharts | 27,580 | 1.2% |
| WSJ | 24,498 | 1.1% |
| YahooFinance | 23,918 | 1.0% |
| markbspiegel | 22,943 | 1.0% |
| charliebilello | 22,067 | 1.0% |
| businessinsider | 20,325 | 0.9% |
| Reuters | 18,696 | 0.8% |
| TheStreet | 17,758 | 0.8% |
| ReutersBiz | 17,024 | 0.7% |
| evannex_com | 16,959 | 0.7% |
| CNNBusiness | 14,863 | 0.6% |
| OphirGottlieb | 14,512 | 0.6% |
| financialbuzz | 14,443 | 0.6% |
| TeslaNY | 13,810 | 0.6% |
| QTRResearch | 13,542 | 0.6% |
Indeed, the above results confirm that tweet counts, comment counts, like counts, and retweet counts are not equally distributed among the writers.
According to the above code, the top 1% makes up 71.9% of total tweets, 82.5% of total comments, 86.6% of total likes and 87.3% of total retweets.
# Count writers by activity level (single-tweet vs multi-tweet) and show
# the two counts side by side in a gt table.
twt_w %>%
  # count() is shorthand for group_by() + summarize(n = n())
  count(tweet_ct_m) %>%
  mutate(n = format(n, big.mark = ",")) %>%
  # pivot_wider() replaces the superseded spread()
  pivot_wider(names_from = tweet_ct_m, values_from = n) %>%
  gt() %>%
  cols_label(
    "0" = "Irregular User",
    "1" = "Regular User"
  ) %>%
  tab_header(
    title = "Number of Writers by Activity Level",
    subtitle = "1 := Posted Tweets > 1, 0 := Posted Tweets == 1"
  )
| Number of Writers by Activity Level | |
|---|---|
| 1 := Posted Tweets > 1, 0 := Posted Tweets == 1 | |
| Irregular User | Regular User |
| 124,850 | 62,552 |
And out of 187,402 writers, 62,552 users posted more than one tweet while 124,850 users posted only a single tweet.
Based on this result, I will create a column called “reg_user” where 0 means the writer only posted one tweet and 1 means the opposite.
# Flag regular users: reg_user = 1 if the writer has more than one tweet
# row, 0 otherwise. add_count() attaches the per-writer row count directly,
# replacing the previous self-join against a grouped summary.
twt <- twt %>%
  add_count(writer) %>%
  mutate(reg_user = ifelse(n > 1, 1, 0)) %>%
  select(-n)
Let’s take a look at distribution of tweets over time.
# Monthly roll-up of tweet activity: distinct writers/tweets per month and
# how many tweets received at least one comment, retweet, or like.
twt_d <- twt %>%
  # truncate each post date to the first of its month
  mutate(post_date_x = as.Date(format(post_date, "%Y-%m-01"))) %>%
  group_by(post_date_x) %>%
  summarize(
    writer_ct = n_distinct(writer),
    tweet_ct = n_distinct(tweet_id),
    # sum(x == 1) replaces length(x[x == 1]); the yesno_* columns are
    # 0/1 indicators, so the two forms agree (no NAs expected — the
    # original subsetting form would have miscounted NAs anyway)
    yescomment_ct = sum(yesno_comment == 1),
    yesretweet_ct = sum(yesno_retweet == 1),
    yeslike_ct = sum(yesno_like == 1),
    .groups = "drop"
  )
# Named color map shared by the engagement time-series plots below.
colors <- c("Have Comments" = "#C0392B", "Have Likes" = "#2980B9", "Retweeted" = "#17A589")
# Monthly tweet volume (bars) overlaid with counts of tweets that received
# comments / likes / retweets (lines), plus dashed linear trends and their
# fitted regression equations.
ggplot(twt_d, aes(x = post_date_x)) +
geom_col(aes(y = tweet_ct, fill = "Total Tweets"), color = "lightgrey") +
# engagement lines: monthly counts of tweets with >= 1 comment/retweet/like
geom_line(aes(y = yescomment_ct, color = "Have Comments"), size = 1) +
geom_line(aes(y = yesretweet_ct, color="Retweeted") , size = 1) +
geom_line(aes(y = yeslike_ct, color = "Have Likes"), size = 1) +
# dashed linear trend lines
# NOTE(review): color = "black" sits inside aes(), so it becomes a discrete
# legend level rather than a literal black line, and scale_color_manual()
# below has no entry for it — confirm the intended trend-line color.
geom_smooth(aes(y = tweet_ct, color = "black"), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
geom_smooth(aes(y = yescomment_ct, color = "Have Comments"), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
geom_smooth(aes(y = yesretweet_ct, color = "Retweeted"), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
geom_smooth(aes(y = yeslike_ct, color = "Have Likes"), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
# fitted regression equations; label.x/label.y are in data coordinates
# (x is days-since-epoch for Date axes)
stat_regline_equation(
aes(y = tweet_ct, label = ..eq.label..),
label.x = 17000, label.y = 67000, size = 3.5
) +
stat_regline_equation(
aes(y = yescomment_ct, label = ..eq.label.., color = "Have Comments"),
label.x = 16900, label.y=-100, size=3.5, show.legend=FALSE
) +
stat_regline_equation(
aes(y = yesretweet_ct, label = ..eq.label.., color = "Retweeted"),
label.x = 17700, label.y = 7000, size = 3.5, show.legend = FALSE
) +
stat_regline_equation(
aes(y = yeslike_ct, label = ..eq.label.., color = "Have Likes"),
label.x = 17100, label.y = 25000, size = 3.5, show.legend = FALSE
) +
labs(
title = "Distribution of Tweets over Time",
y = "Number of Tweets",
x = "Year Month",
fill = "Legend",
color = NULL
) +
scale_fill_manual(values = c("Total Tweets" = "white")) +
scale_color_manual(values = colors) +
scale_y_continuous(labels = function(x) {format(x, big.mark = ",", scientific = FALSE)}) +
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5))
Above shows the monthly summary of the total tweets, each line representing the total counts of the ones with comments, likes or those that have been retweeted.
While the number of posted tweets stays consistent over the years, it can be observed that tweets are more often liked, commented on, and retweeted after January 2018.
This could signify that the general population’s interest in the stock market has started to grow since that time.
This could be a natural outcome since it is said that 2018 was one of the worst years for stock market after the market crash in 2008.
Let’s see how the distinct number of writers change over time compared to the total tweet counts.
# Scale factor that maps writer counts onto the tweet-count axis so both
# series can share one panel; the secondary axis divides it back out.
coef <- mean(twt_d$tweet_ct) / mean(twt_d$writer_ct)
# Monthly tweet count vs. distinct writer count as a dual-axis line chart.
twt_d %>%
ggplot(aes(x = post_date_x)) +
geom_line(aes(y = tweet_ct, color = "Tweet"), size = 0.8) +
geom_line(aes(y = writer_ct * coef, color = "Total Writer"), size = 0.8) +
scale_y_continuous(
labels = function(x) {format(x, big.mark = ",", scientific = FALSE)},
# invert the scaling so the right-hand axis reads in writer units
sec.axis = sec_axis(
~ . / coef,
name = "Writer Count",
labels = function(x) {format(x, big.mark = ",", scientific = FALSE)}
)
) +
theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5)) +
labs(
title = "Tweet Count VS. Writer Count",
x = "Year Month",
y = "Tweet Count",
color = "Legend"
)
According to the result above, there were fewer tweets in 2017 than in 2016 even though there were comparatively more writers.
I’d like to see if this is caused by a handful of users’ unusual active posting.
I will define a column “top_10” based on each users’ total tweet count.
# The ten most active writers by distinct tweets posted, kept as an explicit
# standalone data frame. This replaces the previous `{t10 <<- .}` global
# assignment buried inside the join pipeline — a side effect that made the
# pipe non-obvious and t10's provenance hard to trace.
t10 <- twt %>%
  group_by(writer) %>%
  summarize(n = n_distinct(tweet_id), .groups = "drop") %>%
  arrange(desc(n)) %>%
  head(10) %>%
  select(writer) %>%
  mutate(top_10 = 1)

# Flag each tweet row: top_10 = 1 if its writer is one of the ten most
# active, 0 otherwise (non-matches come back NA from the join).
twt <- twt %>%
  left_join(t10, by = "writer") %>%
  mutate(
    top_10 = ifelse(is.na(top_10), 0, 1)
  )
# Monthly tweet counts stacked by top-10 writer; all other writers collapse
# into a single NA group.
twt %>%
mutate(
writer_10 = ifelse(top_10 == 1, writer, NA),
post_date_x = as.Date(format(post_date, "%Y-%m-01"))
) %>%
group_by(writer_10, post_date_x) %>%
summarize(tweet_ct = n_distinct(tweet_id), .groups="drop") %>%
ggplot(aes(x = post_date_x)) +
geom_bar(
aes(fill = writer_10, y = tweet_ct),
position = position_stack(reverse = TRUE), stat = "identity"
) +
# NOTE(review): these labels assume ggplot orders the ten writers
# alphabetically with the NA group last — verify the legend lines up.
scale_fill_discrete(name = "Top 10 Users", labels = c(sort(t10$writer), "Not Top 10")) +
scale_y_continuous(labels = function(x) {format(x, big.mark=",")}) +
scale_x_date(date_breaks = "6 months", date_labels = "%y\n%b", limits = c(as.Date("2014-11-01"), NA)) +
labs(
title = "Tweet Count by Top 10 Users",
x = "Year Month",
y = "Tweet Count"
) +
theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
Indeed, the top 10 users posted an unusual amount of tweets between Jan 2016 and Sept 2017.
I will combine the Company Tweet Dataset with the Tweet Dataset and continue with the analysis.
Let’s first look into the distribution of tweets over time for each company.
# One row per tweet-ticker pair: attach each tweet's company tag(s).
twt_n_cmpy <- left_join(twt, twt_to_cmpy, by = "tweet_id")
# Facet labels keyed by ticker symbol. "Z" is the synthetic all-company
# total; it sorts after every real ticker so the "Total" panel lands at
# the bottom of the facet grid.
facet_label <- local({
  tickers <- c("AAPL", "GOOG", "GOOGL", "AMZN", "MSFT", "TSLA")
  setNames(c("Total", tickers), c("Z", tickers))
})
# Per-company monthly tweet volume with engagement lines and linear trends,
# faceted by ticker; a duplicated all-company slice (ticker "Z") provides
# the bottom "Total" panel.
twt_n_cmpy %>%
mutate(post_date_x = as.Date(format(post_date, "%Y-%m-01"))) %>%
group_by(post_date_x, ticker_symbol) %>%
summarize(
tweet_ct = n_distinct(tweet_id),
yescomment_ct = length(yesno_comment[yesno_comment == 1]),
yesretweet_ct = length(yesno_retweet[yesno_retweet == 1]),
yeslike_ct = length(yesno_like[yesno_like == 1]),
.groups = "drop"
) %>%
rbind(
# append an all-company copy so facet_grid gets a combined "Total" panel
twt_n_cmpy %>%
mutate(post_date_x = as.Date(format(post_date, "%Y-%m-01")),
ticker_symbol = "Z") %>% #named "Z" so that it can be placed on the bottom
group_by(post_date_x, ticker_symbol) %>%
summarize(
tweet_ct = n_distinct(tweet_id),
yescomment_ct = length(yesno_comment[yesno_comment == 1]),
yesretweet_ct = length(yesno_retweet[yesno_retweet == 1]),
yeslike_ct = length(yesno_like[yesno_like == 1]),
.groups = "drop"
)
) %>%
ggplot(aes(x = post_date_x, group = ticker_symbol)) +
geom_col(aes(y = tweet_ct, fill = "Total Tweets"), color = "lightgrey") +
# engagement lines: tweets with >= 1 comment/retweet/like per month
geom_line(aes(y = yescomment_ct, color = "Have Comments"), size = 1) +
geom_line(aes(y = yesretweet_ct, color = "Retweeted") , size = 1) +
geom_line(aes(y = yeslike_ct, color = "Have Likes"), size = 1) +
# dashed linear trend lines
# NOTE(review): color = "black" inside aes() becomes a legend level, not a
# literal color, and scale_color_manual() has no value for it — confirm.
geom_smooth(aes(y = tweet_ct, color = "black"), method = "lm", se = FALSE, linetype = "dashed", size=0.5) +
geom_smooth(aes(y = yescomment_ct, color = "Have Comments"), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
geom_smooth(aes(y = yesretweet_ct, color = "Retweeted"), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
geom_smooth(aes(y = yeslike_ct, color = "Have Likes"), method = "lm", se = FALSE, linetype = "dashed", size = 0.5) +
# fitted regression equations, anchored in normalized panel coordinates
stat_regline_equation(
aes(y = tweet_ct, label = ..eq.label..),
label.x.npc = 0.90, label.y.npc = 1, size = 3.5
) +
stat_regline_equation(
aes(y = yeslike_ct, label = ..eq.label.., color = "Have Likes"),
label.x.npc = 0.90, label.y.npc = 0.95, size = 3.5, show.legend = FALSE
) +
stat_regline_equation(
aes(y = yesretweet_ct, label = ..eq.label.., color = "Retweeted"),
label.x.npc = 0.90, label.y.npc = 0.9, size = 3.5, show.legend = FALSE
) +
stat_regline_equation(
aes(y = yescomment_ct, label = ..eq.label.., color = "Have Comments"),
label.x.npc = 0.90, label.y.npc = 0.85, size = 3.5, show.legend = FALSE
) +
# NOTE(review): labeller(variable, value) is the deprecated ggplot2
# interface; as_labeller(facet_label) is the modern equivalent — confirm
# against the ggplot2 version in use.
facet_grid(ticker_symbol ~ ., scales = "free", labeller = function(variable, value) {return(facet_label[value])}) +
labs(
title = "Distribution of Tweets over Time",
y = "Number of Tweets",
x = "Year Month",
fill = "Legend",
color = NULL
) +
scale_fill_manual(values = c("Total Tweets" = "white")) +
scale_color_manual(values = colors) +
scale_y_continuous(labels = function(x) {format(x, big.mark = ",", scientific = FALSE)})+
theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5))
It is interesting to note that AMZN and TSLA are the only ones with positive growth in tweet counts over time, and they both had exponential growth in stock prices.
I’d also like to check the proportion of the top 10 users for each companies’ total tweets.
# Per-company monthly tweet counts stacked by top-10 writer (NA = everyone
# else), with a duplicated all-company slice (ticker "Z") as a "Total" facet.
twt_n_cmpy %>%
rbind(
# duplicate all rows under ticker "Z" to form the combined "Total" panel
twt_n_cmpy %>%
mutate(ticker_symbol = "Z")
) %>%
mutate(
writer_10 = ifelse(top_10 == 1, writer, NA),
post_date_x = as.Date(format(post_date, "%Y-%m-01"))
) %>%
group_by(writer_10, post_date_x, ticker_symbol) %>%
summarize(tweet_ct = n_distinct(tweet_id), .groups="drop") %>%
ggplot(aes(x = post_date_x)) +
geom_bar(
aes(fill = writer_10, y = tweet_ct),
position = position_stack(reverse = TRUE), stat = "identity"
) +
# NOTE(review): assumes the ten writers legend-sort alphabetically with
# the NA group last — verify the labels line up.
scale_fill_discrete(name = "Top 10 Users", labels = c(sort(t10$writer), "Not Top 10")) +
scale_y_continuous(labels = function(x) {format(x, big.mark=",")}) +
scale_x_date(date_breaks = "6 months", date_labels = "%y\n%b", limits = c(as.Date("2014-11-01"), NA)) +
labs(
title = "Tweet Count by Top 10 Users",
x = "Year Month",
y = "Tweet Count"
) +
theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5)) +
facet_grid(ticker_symbol ~ ., scales = "free", labeller = function(variable, value) {return(facet_label[value])})
Every company except GOOGL and TSLA is affected by the top 10 users.
For some months, they represent more than 50% of the uploaded tweets.
I will now test if there are any correlation between tweet counts against close value and traded volume.
And I will see if excluding the top 10 users affect the correlation.
# Daily tweet/writer counts per ticker in two overlapping slices:
#   top_10 == 1 -> ALL tweets ("With Top 10")
#   top_10 == 0 -> tweets from non-top-10 writers only ("Without Top 10")
# then joined with the daily stock figures. The previous second
# mutate(top_10 = 0) was redundant: rows surviving filter(top_10 == 0)
# already carry top_10 == 0.
m_twt <- twt_n_cmpy %>%
  mutate(top_10 = 1) %>%
  rbind(
    twt_n_cmpy %>%
      filter(top_10 == 0)
  ) %>%
  group_by(post_date, ticker_symbol, top_10) %>%
  summarize(
    tweet_ct = n_distinct(tweet_id),
    writer_ct = n_distinct(writer),
    .groups = "drop"
  ) %>%
  left_join(
    stk, by = c("post_date" = "day_date", "ticker_symbol" = "ticker_symbol")
  )
# Daily tweet counts vs. close value per company, faceted with/without the
# top-10 writers; close values are rescaled onto the tweet-count axis.
m_twt %>%
mutate(top_10 = ifelse(top_10 == 1, "With Top 10", "Without Top 10")) %>%
group_by(ticker_symbol) %>%
# per-company factor mapping dollars onto the tweet-count axis
mutate(cval_c = sum(tweet_ct) / sum(close_value)) %>%
ggplot(aes(x = post_date)) +
geom_line(aes(y = tweet_ct, color = "Tweet Count")) +
geom_line(aes(y = close_value * cval_c, color = "Close Value $")) +
# label + point at each company's maximum close value
# (`.` is the data piped in above; min(post_date) breaks date ties)
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(close_value == max(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, #label y location
label = paste("$", format(close_value, big.mark = ",", scientific = FALSE), sep = "")),
hjust = 0.7, vjust = -0.2,
label.size = NA,
alpha = 0.5
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (close_value == max(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, color = "Max Close Value $")
) +
# label + point at each company's minimum close value
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(close_value == min(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, #label y location
label = paste("$", format(close_value, big.mark = ",", scientific = FALSE), sep = "")),
hjust = 0.3, vjust = -0.2,
label.size = NA,
alpha = 0.5
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (close_value == min(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, color = "Min Close Value $")
) +
scale_y_continuous(labels=function(x) {format(x,big.mark = ",", scientific = FALSE)}) +
scale_color_manual(values = c("Tweet Count" = "dodgerblue", "Close Value $" = "darkgreen", "Max Close Value $" = "red", "Min Close Value $" = "blue")) +
facet_grid(rows = vars(ticker_symbol), cols = vars(top_10), scales = "free") +
labs(
title = "Tweet Counts VS Close Value",
x = "Year Month",
y = "Tweet Counts",
color = "Legend"
) +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
legend.position = "top"
)
# Diverging color ramp for correlation coefficients: a red ramp for
# negative values and a green ramp for positive ones, each darkening as
# the magnitude grows. Values outside either domain fall back to white
# via na.color.
c_color <- function(x) {
  negative_ramp <- scales::col_numeric(
    palette = c("#E74C3C", "#FDEDEC"),
    domain = c(-1, -0.3),
    na.color = "white"
  )
  positive_ramp <- scales::col_numeric(
    palette = c("#E9F7EF", "#27AE60"),
    domain = c(0.3, 1),
    na.color = "white"
  )
  # elementwise: positives through the green ramp, the rest through red
  ifelse(x > 0, positive_ramp(x), negative_ramp(x))
}
# Pearson correlation between daily tweet counts and close value per
# company, with vs. without the top-10 writers. cor.test() now runs once
# per group (stored as a list-column) instead of twice.
m_twt %>%
  group_by(ticker_symbol, top_10) %>%
  summarize(
    test = list(cor.test(tweet_ct, close_value)),
    .groups = "drop"
  ) %>%
  mutate(
    correlation = vapply(test, function(t) unname(t$estimate), numeric(1)),
    p_value = vapply(test, function(t) t$p.value, numeric(1))
  ) %>%
  select(-test) %>%
  pivot_wider(names_from = top_10, values_from = c(correlation, p_value)) %>%
  gt() %>%
  tab_spanner(
    label = "With Top 10",
    columns = c(correlation_1, p_value_1)
  ) %>%
  tab_spanner(
    label = "Without Top 10",
    columns = c(correlation_0, p_value_0)
  ) %>%
  data_color(
    # columns 2:3 are the two correlation-coefficient columns
    columns = 2:3, colors = c_color
  ) %>%
  tab_header(
    title = "Pearson Correlation Test - Tweet Counts VS Close Value"
  ) %>%
  cols_label(
    ticker_symbol = "Company",
    correlation_0 = "cor. coef.",
    p_value_0 = "p value",
    correlation_1 = "cor. coef.",
    p_value_1 = "p value"
  )
| Pearson Correlation Test - Tweet Counts VS Close Value | ||||
|---|---|---|---|---|
| Company | Without Top 10 | With Top 10 | ||
| cor. coef. | p value | cor. coef. | p value | |
| AAPL | -0.18638820 | 1.107931e-15 | -0.33616584 | 2.685836e-49 |
| AMZN | 0.15933702 | 7.495504e-12 | 0.11292944 | 1.312526e-06 |
| GOOG | -0.50675726 | 1.194798e-119 | -0.53966302 | 1.866146e-138 |
| GOOGL | -0.10109192 | 1.504936e-05 | -0.05087817 | 2.970280e-02 |
| MSFT | 0.08824097 | 1.597043e-04 | -0.13687871 | 4.287067e-09 |
| TSLA | 0.24088396 | 1.620535e-25 | 0.24239616 | 7.935608e-26 |
AMZN and GOOGL are the only companies whose correlation improved after removing the top 10 users.
This means the top 10 users had little effect on the association between tweet counts and close values.
Not only that, the correlation coefficients indicated insignificance for every company except GOOG.
# Daily tweet counts vs. traded volume per company, faceted with/without
# the top-10 writers; volumes are rescaled onto the tweet-count axis.
m_twt %>%
mutate(top_10 = ifelse(top_10 == 1, "With Top 10", "Without Top 10")) %>%
group_by(ticker_symbol) %>%
# per-company factor mapping share volume onto the tweet-count axis
mutate(vol_c = sum(tweet_ct) / sum(volume)) %>%
ggplot(aes(x = post_date)) +
geom_line(aes(y = tweet_ct, color = "Tweet Count")) +
geom_line(aes(y = volume * vol_c, color = "Traded Volume")) +
# label + point at each company's maximum traded volume
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(volume == max(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, #label y location
label = format(volume, big.mark = ",", scientific = FALSE)),
hjust = 0.2, vjust = -0.2,
label.size = NA,
alpha = 0.6
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (volume == max(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, color = "Max Trade Volume")
) +
# label + point at each company's minimum traded volume
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter( volume== min(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, #label y location
label = format(volume, big.mark = ",", scientific = FALSE)),
hjust = 0.7, vjust = -0.2,
label.size = NA,
alpha = 0.6
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (volume == min(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, color = "Min Trade Volume")
) +
scale_color_manual(values = c("Tweet Count" = "dodgerblue", "Traded Volume" = "darkred", "Max Trade Volume" = "red", "Min Trade Volume" = "blue")) +
facet_grid(rows = vars(ticker_symbol), cols = vars(top_10), scales = "free") +
labs(
title = "Tweet Counts VS Traded Volume",
x = "Year Month",
y = "Tweet Counts",
color = "Legend"
) +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
legend.position = "top"
)
# Pearson correlation between daily tweet counts and traded volume per
# company, with vs. without the top-10 writers. cor.test() runs once per
# group instead of twice.
m_twt %>%
  group_by(ticker_symbol, top_10) %>%
  summarize(
    test = list(cor.test(tweet_ct, volume)),
    .groups = "drop"
  ) %>%
  mutate(
    correlation = vapply(test, function(t) unname(t$estimate), numeric(1)),
    p_value = vapply(test, function(t) t$p.value, numeric(1))
  ) %>%
  select(-test) %>%
  pivot_wider(names_from = top_10, values_from = c(correlation, p_value)) %>%
  gt() %>%
  tab_spanner(
    label = "With Top 10",
    columns = c(correlation_1, p_value_1)
  ) %>%
  tab_spanner(
    label = "Without Top 10",
    columns = c(correlation_0, p_value_0)
  ) %>%
  data_color(
    # columns 2:3 are the two correlation-coefficient columns
    columns = 2:3, colors = c_color
  ) %>%
  tab_header(
    title = "Pearson Correlation Test - Tweet Counts VS Traded Volume"
  ) %>%
  cols_label(
    ticker_symbol = "Company",
    correlation_0 = "cor. coef.",
    p_value_0 = "p value",
    correlation_1 = "cor. coef.",
    p_value_1 = "p value"
  )
| Pearson Correlation Test - Tweet Counts VS Traded Volume | ||||
|---|---|---|---|---|
| Company | Without Top 10 | With Top 10 | ||
| cor. coef. | p value | cor. coef. | p value | |
| AAPL | 0.5480517 | 3.990960e-143 | 0.4683100 | 8.231821e-100 |
| AMZN | 0.3654463 | 8.484498e-59 | 0.3370077 | 9.811757e-50 |
| GOOG | 0.3635723 | 3.852711e-58 | 0.2557749 | 1.193685e-28 |
| GOOGL | 0.4115536 | 1.420638e-75 | 0.4030247 | 2.887951e-72 |
| MSFT | 0.2832046 | 5.045231e-35 | 0.1605811 | 5.125945e-12 |
| TSLA | 0.6639286 | 1.695868e-232 | 0.6655971 | 4.507415e-234 |
For Volume VS Tweet Count, the correlation improved overall without the top 10.
With the top 10, every company except GOOG and MSFT showed moderate correlation.
Without them, MSFT is the only company with a weak correlation.
Now, I’d like to compare whether mult_tag and reg_user columns will affect the correlation.
# Daily tweet/writer counts split by whether the tweet tagged multiple
# companies (mult_tag), joined with the daily stock figures.
m_twt_m <- left_join(
  summarize(
    group_by(twt_n_cmpy, post_date, ticker_symbol, mult_tag),
    tweet_ct = n_distinct(tweet_id),
    writer_ct = n_distinct(writer),
    .groups = "drop"
  ),
  stk,
  by = c("post_date" = "day_date", "ticker_symbol" = "ticker_symbol")
)
# Daily tweet counts vs. close value per company, faceted by whether the
# tweets tagged one or multiple companies; close values are rescaled onto
# the tweet-count axis.
m_twt_m %>%
mutate(mult_tag = ifelse(mult_tag == 1, "Mult. Tag", "Single Tag")) %>%
group_by(ticker_symbol) %>%
# per-company factor mapping dollars onto the tweet-count axis
mutate(cval_c = sum(tweet_ct) / sum(close_value)) %>%
ggplot(aes(x = post_date)) +
geom_line(aes(y = tweet_ct, color = "Tweet Count")) +
geom_line(aes(y = close_value * cval_c, color = "Close Value $")) +
# label + point at each company's maximum close value
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(close_value == max(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, #label y location
label = paste("$", format(close_value, big.mark = ",", scientific = FALSE), sep = "")),
hjust = 0.7, vjust = -0.2,
label.size = NA,
alpha = 0.5
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (close_value == max(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, color = "Max Close Value $")
) +
# label + point at each company's minimum close value
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(close_value == min(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, #label y location
label = paste("$", format(close_value, big.mark = ",", scientific = FALSE), sep = "")),
hjust = 0.3, vjust = -0.2,
label.size = NA,
alpha = 0.5
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (close_value == min(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, color = "Min Close Value $")
) +
scale_y_continuous(labels=function(x) {format(x,big.mark = ",", scientific = FALSE)}) +
scale_color_manual(values = c("Tweet Count" = "dodgerblue", "Close Value $" = "darkgreen", "Max Close Value $" = "red", "Min Close Value $" = "blue")) +
facet_grid(rows = vars(ticker_symbol), cols = vars(mult_tag), scales = "free") +
labs(
title = "Tweet Counts VS Close Value (by mult_tag)",
x = "Year Month",
y = "Tweet Counts",
color = "Legend"
) +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
legend.position = "top"
)
# Pearson correlation between daily tweet counts and close value per
# company, split by single-tag vs multi-tag tweets. cor.test() runs once
# per group instead of twice.
m_twt_m %>%
  group_by(ticker_symbol, mult_tag) %>%
  summarize(
    test = list(cor.test(tweet_ct, close_value)),
    .groups = "drop"
  ) %>%
  mutate(
    correlation = vapply(test, function(t) unname(t$estimate), numeric(1)),
    p_value = vapply(test, function(t) t$p.value, numeric(1))
  ) %>%
  select(-test) %>%
  pivot_wider(names_from = mult_tag, values_from = c(correlation, p_value)) %>%
  gt() %>%
  tab_spanner(
    label = "Multiple Tag",
    columns = c(correlation_1, p_value_1)
  ) %>%
  tab_spanner(
    label = "Single Tag",
    columns = c(correlation_0, p_value_0)
  ) %>%
  tab_header(
    title = "Pearson Correlation Test - Tweet Counts VS Close Value (by mult_tag)"
  ) %>%
  # columns 2:3 are the two correlation-coefficient columns
  data_color(columns = 2:3, colors = c_color) %>%
  cols_label(
    ticker_symbol = "Company",
    correlation_0 = "cor. coef.",
    p_value_0 = "p value",
    correlation_1 = "cor. coef.",
    p_value_1 = "p value"
  )
| Pearson Correlation Test - Tweet Counts VS Close Value (by mult_tag) | ||||
|---|---|---|---|---|
| Company | Single Tag | Multiple Tag | ||
| cor. coef. | p value | cor. coef. | p value | |
| AAPL | -0.37811571 | 6.597217e-63 | 0.1432558 | 8.393398e-10 |
| AMZN | -0.02512403 | 2.832590e-01 | 0.4700031 | 5.369775e-101 |
| GOOG | -0.55022291 | 6.049779e-145 | -0.3372858 | 8.588846e-50 |
| GOOGL | -0.17520767 | 4.692703e-14 | 0.1642226 | 1.679970e-12 |
| MSFT | -0.22335439 | 4.455721e-22 | 0.2316917 | 1.118422e-23 |
| TSLA | 0.24035278 | 2.080027e-25 | 0.1595901 | 6.939524e-12 |
Compared to the original, the single-tagged data showed stronger correlation for all companies except AMZN.
It could be that considering only the single-tagged tweets is more beneficial for finding an association with the close value.
That said, AMZN has the highest correlation coefficient when only the multi-tagged tweets are considered.
# Daily tweet counts vs. traded volume per company, faceted by single-tag
# vs multi-tag tweets; volumes are rescaled onto the tweet-count axis.
m_twt_m %>%
mutate(mult_tag = ifelse(mult_tag == 1, "Mult. Tag", "Single Tag")) %>%
group_by(ticker_symbol) %>%
# per-company factor mapping share volume onto the tweet-count axis
mutate(vol_c = sum(tweet_ct) / sum(volume)) %>%
ggplot(aes(x = post_date)) +
geom_line(aes(y = tweet_ct, color = "Tweet Count")) +
geom_line(aes(y = volume * vol_c, color = "Traded Volume")) +
# label + point at each company's maximum traded volume
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(volume == max(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, #label y location
label = format(volume, big.mark = ",", scientific = FALSE)),
hjust = 0.2, vjust = -0.2,
label.size = NA,
alpha = 0.6
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (volume == max(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, color = "Max Trade Volume")
) +
# label + point at each company's minimum traded volume
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter( volume== min(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, #label y location
label = format(volume, big.mark = ",", scientific = FALSE)),
hjust = 0.7, vjust = -0.2,
label.size = NA,
alpha = 0.6
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (volume == min(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, color = "Min Trade Volume")
) +
scale_color_manual(values = c("Tweet Count" = "dodgerblue", "Traded Volume" = "darkred", "Max Trade Volume" = "red", "Min Trade Volume" = "blue")) +
facet_grid(rows = vars(ticker_symbol), cols = vars(mult_tag), scales = "free") +
labs(
title = "Tweet Counts VS Traded Volume (by mult_tag)",
x = "Year Month",
y = "Tweet Counts",
color = "Legend"
) +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
legend.position = "top"
)
# Pearson correlation between daily tweet counts and traded volume per
# company, split by single-tag vs multi-tag tweets. cor.test() runs once
# per group instead of twice.
m_twt_m %>%
  group_by(ticker_symbol, mult_tag) %>%
  summarize(
    test = list(cor.test(tweet_ct, volume)),
    .groups = "drop"
  ) %>%
  mutate(
    correlation = vapply(test, function(t) unname(t$estimate), numeric(1)),
    p_value = vapply(test, function(t) t$p.value, numeric(1))
  ) %>%
  select(-test) %>%
  pivot_wider(names_from = mult_tag, values_from = c(correlation, p_value)) %>%
  gt() %>%
  tab_spanner(
    label = "Multiple Tag",
    columns = c(correlation_1, p_value_1)
  ) %>%
  tab_spanner(
    label = "Single Tag",
    columns = c(correlation_0, p_value_0)
  ) %>%
  tab_header(
    title = "Pearson Correlation Test - Tweet Counts VS Traded Volume (by mult_tag)"
  ) %>%
  # columns 2:3 are the two correlation-coefficient columns
  data_color(columns = 2:3, colors = c_color) %>%
  cols_label(
    ticker_symbol = "Company",
    correlation_0 = "cor. coef.",
    p_value_0 = "p value",
    correlation_1 = "cor. coef.",
    p_value_1 = "p value"
  )
| Pearson Correlation Test - Tweet Counts VS Traded Volume (by mult_tag) | ||||
|---|---|---|---|---|
| Company | Single Tag | Multiple Tag | ||
| cor. coef. | p value | cor. coef. | p value | |
| AAPL | 0.4729841 | 4.800912e-102 | 0.1832003 | 3.412567e-15 |
| AMZN | 0.3134335 | 6.543233e-43 | 0.2815531 | 1.278818e-34 |
| GOOG | 0.2134986 | 2.945432e-20 | 0.2843090 | 2.816178e-35 |
| GOOGL | 0.4022377 | 5.767563e-72 | 0.3070734 | 3.765652e-41 |
| MSFT | 0.1343529 | 8.246228e-09 | 0.1657465 | 1.024804e-12 |
| TSLA | 0.6685823 | 6.451042e-237 | 0.3527989 | 1.187685e-54 |
Neither of the tag groups improved the correlation compared to the original.
It stayed broadly consistent for the single-tagged tweets.
But for the multi-tagged tweets, the companies that originally showed weaker correlation (below 0.3, like GOOG and MSFT) showed slightly stronger correlation.
# Daily tweet/writer counts split by writer activity level (reg_user:
# 1 = posted more than one tweet), joined with the daily stock figures.
m_twt_r <- left_join(
  summarize(
    group_by(twt_n_cmpy, post_date, ticker_symbol, reg_user),
    tweet_ct = n_distinct(tweet_id),
    writer_ct = n_distinct(writer),
    .groups = "drop"
  ),
  stk,
  by = c("post_date" = "day_date", "ticker_symbol" = "ticker_symbol")
)
# Daily tweet counts vs. close value per company, faceted by regular vs
# irregular writers; close values are rescaled onto the tweet-count axis.
m_twt_r %>%
mutate(reg_user = ifelse(reg_user == 1, "Reg", "Irreg")) %>%
group_by(ticker_symbol) %>%
# per-company factor mapping dollars onto the tweet-count axis
mutate(cval_c = sum(tweet_ct) / sum(close_value)) %>%
ggplot(aes(x = post_date)) +
geom_line(aes(y = tweet_ct, color = "Tweet Count")) +
geom_line(aes(y = close_value * cval_c, color = "Close Value $")) +
# label + point at each company's maximum close value
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(close_value == max(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, #label y location
label = paste("$", format(close_value, big.mark = ",", scientific = FALSE), sep = "")),
hjust = 0.7, vjust = -0.2,
label.size = NA,
alpha = 0.5
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (close_value == max(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, color = "Max Close Value $")
) +
# label + point at each company's minimum close value
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(close_value == min(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, #label y location
label = paste("$", format(close_value, big.mark = ",", scientific = FALSE), sep = "")),
hjust = 0.3, vjust = -0.2,
label.size = NA,
alpha = 0.5
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (close_value == min(close_value)) %>%
filter(post_date == min(post_date)),
aes(y = close_value * cval_c, color = "Min Close Value $")
) +
scale_y_continuous(labels=function(x) {format(x,big.mark = ",", scientific = FALSE)}) +
scale_color_manual(values = c("Tweet Count" = "dodgerblue", "Close Value $" = "darkgreen", "Max Close Value $" = "red", "Min Close Value $" = "blue")) +
facet_grid(rows = vars(ticker_symbol), cols = vars(reg_user), scales = "free") +
labs(
title = "Tweet Counts VS Close Value (by reg_user)",
x = "Year Month",
y = "Tweet Counts",
color = "Legend"
) +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
legend.position = "top"
)
# Pearson correlation between daily tweet counts and close value, split by
# regular (reg_user == 1) vs. irregular (reg_user == 0) users.
# cor.test() is run ONCE per group and both the estimate and the p value are
# extracted from that single result (the original called it twice per group).
m_twt_r %>%
group_by(ticker_symbol, reg_user) %>%
summarize(
cor_res = list(cor.test(tweet_ct, close_value)),
.groups = "drop") %>%
mutate(
correlation = vapply(cor_res, function(t) unname(t$estimate), numeric(1)),
p_value = vapply(cor_res, function(t) t$p.value, numeric(1))
) %>%
select(-cor_res) %>%
# One row per company; _0 / _1 suffixes come from the reg_user values.
pivot_wider(names_from = reg_user, values_from = c(correlation, p_value)) %>%
gt() %>%
tab_spanner(
label = "Regular User",
columns = c(correlation_1, p_value_1)
) %>%
tab_spanner(
label = "Irregular User",
columns = c(correlation_0, p_value_0)
) %>%
tab_header(
title = "Pearson Correlation Test - Tweet Counts VS Close Value (by reg_user)"
)%>%
# Columns 2:3 are the two correlation-coefficient columns after pivoting.
data_color(columns = 2:3, colors = c_color) %>%
cols_label(
ticker_symbol = "Company",
correlation_0 = "cor. coef.",
p_value_0 = "p value",
correlation_1 = "cor. coef.",
p_value_1 = "p value"
)
| Pearson Correlation Test - Tweet Counts VS Close Value (by reg_user) | ||||
|---|---|---|---|---|
| Company | Irregular User | Regular User | ||
| cor. coef. | p value | cor. coef. | p value | |
| AAPL | 0.14032376 | 2.134688e-09 | -0.35195135 | 3.529292e-54 |
| AMZN | 0.08389052 | 3.750697e-04 | 0.11263721 | 1.398289e-06 |
| GOOG | -0.15535687 | 6.479324e-11 | -0.53852425 | 9.057801e-138 |
| GOOGL | -0.15594237 | 3.406086e-10 | -0.04504626 | 5.428405e-02 |
| MSFT | -0.04485173 | 6.795772e-02 | -0.13925819 | 2.289551e-09 |
| TSLA | 0.27264674 | 9.582652e-32 | 0.23954361 | 3.038830e-25 |
For Tweet Counts VS Close Value, Reg tag seems to not affect the correlation when compared to the original.
And the Irreg tag has lowered the correlation for every company except for TSLA, which increased slightly.
# Plot daily tweet counts against traded volume, faceted by company (rows)
# and by regular vs. irregular user (columns).
m_twt_r %>%
mutate(reg_user = ifelse(reg_user == 1, "Reg", "Irreg")) %>%
group_by(ticker_symbol) %>%
# vol_c rescales volume onto the tweet-count axis so both series share
# one y scale within each company's facet.
mutate(vol_c = sum(tweet_ct) / sum(volume)) %>%
ggplot(aes(x = post_date)) +
geom_line(aes(y = tweet_ct, color = "Tweet Count")) +
geom_line(aes(y = volume * vol_c, color = "Traded Volume")) +
#max close_value label
# Label the earliest occurrence of the maximum traded volume per company
# (the second filter breaks ties by taking the first date).
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter(volume == max(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, #label y location
label = format(volume, big.mark = ",", scientific = FALSE)),
hjust = 0.2, vjust = -0.2,
label.size = NA,
alpha = 0.6
) +
# Mark the max-volume point itself on the rescaled axis.
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (volume == max(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, color = "Max Trade Volume")
) +
#min close_value label
# Same labeling scheme for the minimum traded volume.
geom_label(
data = . %>%
group_by(ticker_symbol) %>%
filter( volume== min(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, #label y location
label = format(volume, big.mark = ",", scientific = FALSE)),
hjust = 0.7, vjust = -0.2,
label.size = NA,
alpha = 0.6
) +
geom_point(
data = . %>%
group_by(ticker_symbol) %>%
filter (volume == min(volume)) %>%
filter(post_date == min(post_date)),
aes(y = volume * vol_c, color = "Min Trade Volume")
) +
scale_color_manual(values = c("Tweet Count" = "dodgerblue", "Traded Volume" = "darkred", "Max Trade Volume" = "red", "Min Trade Volume" = "blue")) +
# Free scales: each company/user-type panel gets its own axis range.
facet_grid(rows = vars(ticker_symbol), cols = vars(reg_user), scales = "free") +
labs(
title = "Tweet Counts VS Traded Volume (by reg_user)",
x = "Year Month",
y = "Tweet Counts",
color = "Legend"
) +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
legend.position = "top"
)
# Pearson correlation between daily tweet counts and traded volume, split by
# regular (reg_user == 1) vs. irregular (reg_user == 0) users.
# cor.test() is run ONCE per group and both the estimate and the p value are
# extracted from that single result (the original called it twice per group).
m_twt_r %>%
group_by(ticker_symbol, reg_user) %>%
summarize(
cor_res = list(cor.test(tweet_ct, volume)),
.groups = "drop") %>%
mutate(
correlation = vapply(cor_res, function(t) unname(t$estimate), numeric(1)),
p_value = vapply(cor_res, function(t) t$p.value, numeric(1))
) %>%
select(-cor_res) %>%
# One row per company; _0 / _1 suffixes come from the reg_user values.
pivot_wider(names_from = reg_user, values_from = c(correlation, p_value)) %>%
gt() %>%
tab_spanner(
label = "Regular User",
columns = c(correlation_1, p_value_1)
) %>%
tab_spanner(
label = "Irregular User",
columns = c(correlation_0, p_value_0)
) %>%
tab_header(
title = "Pearson Correlation Test - Tweet Counts VS Traded Volume (by reg_user)"
)%>%
# Columns 2:3 are the two correlation-coefficient columns after pivoting.
data_color(columns = 2:3, colors = c_color) %>%
cols_label(
ticker_symbol = "Company",
correlation_0 = "cor. coef.",
p_value_0 = "p value",
correlation_1 = "cor. coef.",
p_value_1 = "p value"
)
| Pearson Correlation Test - Tweet Counts VS Traded Volume (by reg_user) | ||||
|---|---|---|---|---|
| Company | Irregular User | Regular User | ||
| cor. coef. | p value | cor. coef. | p value | |
| AAPL | 0.2330860 | 1.076247e-23 | 0.4663589 | 6.884870e-99 |
| AMZN | 0.2890058 | 7.447800e-36 | 0.3350305 | 3.868603e-49 |
| GOOG | 0.1390714 | 5.208981e-09 | 0.2521784 | 7.177336e-28 |
| GOOGL | 0.3038181 | 1.330564e-35 | 0.3996570 | 5.501083e-71 |
| MSFT | 0.2229513 | 4.134225e-20 | 0.1558352 | 2.149519e-11 |
| TSLA | 0.6104662 | 1.831110e-182 | 0.6646295 | 3.705135e-233 |
Correlation for Traded Volume VS Tweet Count stayed consistent with the tag as well.
And just as in the result above, the Irreg tag has lowered the correlation for every company except for MSFT.
Through the analysis, we were able to identify some association between the daily traded volume and the tweet counts for all companies except for MSFT and GOOG.
Also, dividing the tweets by mult_tag and reg_user did not improve on the correlation, although the Single Tag and Regular User resulted in similar values from the original.
As for the close value and tweet counts, AAPL and GOOG are the only companies that showed any significant association.
Restricting to the single-tagged tweets increased the correlation, but only slightly, and most of the values were still considered insignificant.